In [1]:
import pandas as pd
import numpy as np

import seaborn as sn
import plotly.express as px
import plotly.offline as pyo
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)

import os
In [2]:
df = pd.read_csv('Chennai houseing sale.csv')
df.head()
Out[2]:
PRT_ID AREA INT_SQFT DATE_SALE DIST_MAINROAD N_BEDROOM N_BATHROOM N_ROOM SALE_COND PARK_FACIL DATE_BUILD BUILDTYPE UTILITY_AVAIL STREET MZZONE QS_ROOMS QS_BATHROOM QS_BEDROOM QS_OVERALL REG_FEE COMMIS SALES_PRICE
0 P03210 Karapakkam 1004 04-05-2011 131 1.0 1.0 3 AbNormal Yes 15-05-1967 Commercial AllPub Paved A 4.0 3.9 4.9 4.330 380000 144400 7600000
1 P09411 Anna Nagar 1986 19-12-2006 26 2.0 1.0 5 AbNormal No 22-12-1995 Commercial AllPub Gravel RH 4.9 4.2 2.5 3.765 760122 304049 21717770
2 P01812 Adyar 909 04-02-2012 70 1.0 1.0 3 AbNormal Yes 09-02-1992 Commercial ELO Gravel RL 4.1 3.8 2.2 3.090 421094 92114 13159200
3 P05346 Velachery 1855 13-03-2010 14 3.0 2.0 5 Family No 18-03-1988 Others NoSewr Paved I 4.7 3.9 3.6 4.010 356321 77042 9630290
4 P06210 Karapakkam 1226 05-10-2009 84 1.0 1.0 3 AbNormal Yes 13-10-1979 Others AllPub Gravel C 3.0 2.5 4.1 3.290 237000 74063 7406250
In [4]:
df.shape
Out[4]:
(7109, 22)
In [5]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7109 entries, 0 to 7108
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PRT_ID         7109 non-null   object 
 1   AREA           7109 non-null   object 
 2   INT_SQFT       7109 non-null   int64  
 3   DATE_SALE      7109 non-null   object 
 4   DIST_MAINROAD  7109 non-null   int64  
 5   N_BEDROOM      7108 non-null   float64
 6   N_BATHROOM     7104 non-null   float64
 7   N_ROOM         7109 non-null   int64  
 8   SALE_COND      7109 non-null   object 
 9   PARK_FACIL     7109 non-null   object 
 10  DATE_BUILD     7109 non-null   object 
 11  BUILDTYPE      7109 non-null   object 
 12  UTILITY_AVAIL  7109 non-null   object 
 13  STREET         7109 non-null   object 
 14  MZZONE         7109 non-null   object 
 15  QS_ROOMS       7109 non-null   float64
 16  QS_BATHROOM    7109 non-null   float64
 17  QS_BEDROOM     7109 non-null   float64
 18  QS_OVERALL     7061 non-null   float64
 19  REG_FEE        7109 non-null   int64  
 20  COMMIS         7109 non-null   int64  
 21  SALES_PRICE    7109 non-null   int64  
dtypes: float64(6), int64(6), object(10)
memory usage: 1.2+ MB

Check null values¶

In [6]:
df.isnull().sum()
Out[6]:
PRT_ID            0
AREA              0
INT_SQFT          0
DATE_SALE         0
DIST_MAINROAD     0
N_BEDROOM         1
N_BATHROOM        5
N_ROOM            0
SALE_COND         0
PARK_FACIL        0
DATE_BUILD        0
BUILDTYPE         0
UTILITY_AVAIL     0
STREET            0
MZZONE            0
QS_ROOMS          0
QS_BATHROOM       0
QS_BEDROOM        0
QS_OVERALL       48
REG_FEE           0
COMMIS            0
SALES_PRICE       0
dtype: int64
Replacing the Null Values with mean values of the data using Simple Imputer¶
In [9]:
from sklearn.impute import SimpleImputer

imputer=SimpleImputer(missing_values=np.nan,strategy='mean',fill_value=None)

df['N_BEDROOM'] = imputer.fit_transform(df[['N_BEDROOM']])
df['N_BATHROOM'] = imputer.fit_transform(df[['N_BATHROOM']])
df['QS_OVERALL'] = imputer.fit_transform(df[['QS_OVERALL']])
Lets see the unique values of columns¶
In [10]:
for col in df.columns:
    if df[col].dtype=='object':
        print()
        print(col)
        print(df[col].unique())
PRT_ID
['P03210' 'P09411' 'P01812' ... 'P09594' 'P06508' 'P09794']

AREA
['Karapakkam' 'Anna Nagar' 'Adyar' 'Velachery' 'Chrompet' 'KK Nagar'
 'TNagar' 'T Nagar' 'Chrompt' 'Chrmpet' 'Karapakam' 'Ana Nagar' 'Chormpet'
 'Adyr' 'Velchery' 'Ann Nagar' 'KKNagar']

DATE_SALE
['04-05-2011' '19-12-2006' '04-02-2012' ... '28-03-2014' '25-08-2006'
 '13-07-2005']

SALE_COND
['AbNormal' 'Family' 'Partial' 'AdjLand' 'Normal Sale' 'Ab Normal'
 'Partiall' 'Adj Land' 'PartiaLl']

PARK_FACIL
['Yes' 'No' 'Noo']

DATE_BUILD
['15-05-1967' '22-12-1995' '09-02-1992' ... '01-09-1978' '11-08-1977'
 '24-07-1961']

BUILDTYPE
['Commercial' 'Others' 'Other' 'House' 'Comercial']

UTILITY_AVAIL
['AllPub' 'ELO' 'NoSewr ' 'NoSeWa' 'All Pub']

STREET
['Paved' 'Gravel' 'No Access' 'Pavd' 'NoAccess']

MZZONE
['A' 'RH' 'RL' 'I' 'C' 'RM']
Cleaning the Data¶
In [11]:
df.AREA.replace(['Ana Nagar','Ann Nagar'],'Anna Nagar',inplace=True)
df.AREA.replace('Karapakkam','Karapakam',inplace=True)
df.AREA.replace(['Chrompt','Chrmpet','Chormpet'],'Chrompet',inplace=True)
df.AREA.replace('KKNagar','KK Nagar',inplace=True)
df.AREA.replace('TNagar','T Nagar',inplace=True)
df.AREA.replace('Adyr','Adyar',inplace=True)
df.AREA.replace('Velchery','Velachery',inplace=True)

df.BUILDTYPE.replace('Comercial','Commercial',inplace=True)
df.BUILDTYPE.replace('Other','Others',inplace=True)

df.UTILITY_AVAIL.replace('AllPub','All Pub',inplace=True)
df.UTILITY_AVAIL.replace(['NoSewr','NoSewr'],'NoSeWa',inplace=True)

df.SALE_COND.replace('Ab Normal','AbNormal',inplace=True)
df.SALE_COND.replace(['PartiaLl','Partiall'],'Partial',inplace=True)
df.SALE_COND.replace('Adj Land','AdjLand',inplace=True)

df.PARK_FACIL.replace('Noo','No',inplace=True)

df.STREET.replace('Pavd','Paved',inplace=True)
df.STREET.replace('NoAccess','No Access',inplace=True)
Changing the datatype from float to int¶
In [12]:
df.QS_ROOMS=df.QS_ROOMS.astype(int)
df.QS_BATHROOM=df.QS_BATHROOM.astype(int)
df.QS_BEDROOM=df.QS_BEDROOM.astype(int)
df.N_BEDROOM=df.N_BEDROOM.astype(int)
df.N_BATHROOM=df.N_BATHROOM.astype(int)
In [13]:
df.head()
Out[13]:
PRT_ID AREA INT_SQFT DATE_SALE DIST_MAINROAD N_BEDROOM N_BATHROOM N_ROOM SALE_COND PARK_FACIL DATE_BUILD BUILDTYPE UTILITY_AVAIL STREET MZZONE QS_ROOMS QS_BATHROOM QS_BEDROOM QS_OVERALL REG_FEE COMMIS SALES_PRICE
0 P03210 Karapakam 1004 04-05-2011 131 1 1 3 AbNormal Yes 15-05-1967 Commercial All Pub Paved A 4 3 4 4.330 380000 144400 7600000
1 P09411 Anna Nagar 1986 19-12-2006 26 2 1 5 AbNormal No 22-12-1995 Commercial All Pub Gravel RH 4 4 2 3.765 760122 304049 21717770
2 P01812 Adyar 909 04-02-2012 70 1 1 3 AbNormal Yes 09-02-1992 Commercial ELO Gravel RL 4 3 2 3.090 421094 92114 13159200
3 P05346 Velachery 1855 13-03-2010 14 3 2 5 Family No 18-03-1988 Others NoSewr Paved I 4 3 3 4.010 356321 77042 9630290
4 P06210 Karapakam 1226 05-10-2009 84 1 1 3 AbNormal Yes 13-10-1979 Others All Pub Gravel C 3 2 4 3.290 237000 74063 7406250
Stastical Description of Data¶
In [14]:
df.describe()
Out[14]:
INT_SQFT DIST_MAINROAD N_BEDROOM N_BATHROOM N_ROOM QS_ROOMS QS_BATHROOM QS_BEDROOM QS_OVERALL REG_FEE COMMIS SALES_PRICE
count 7109.000000 7109.000000 7109.000000 7109.000000 7109.000000 7109.000000 7109.000000 7109.000000 7109.000000 7109.000000 7109.000000 7.109000e+03
mean 1382.073006 99.603179 1.636939 1.213110 3.688704 3.075538 3.069349 3.056126 3.503254 376938.330708 141005.726544 1.089491e+07
std 457.410902 57.403110 0.802881 0.409534 1.019099 0.872937 0.876024 0.871499 0.525440 143070.662010 78768.093718 3.768603e+06
min 500.000000 0.000000 1.000000 1.000000 2.000000 2.000000 2.000000 2.000000 2.000000 71177.000000 5055.000000 2.156875e+06
25% 993.000000 50.000000 1.000000 1.000000 3.000000 2.000000 2.000000 2.000000 3.130000 272406.000000 84219.000000 8.272100e+06
50% 1373.000000 99.000000 1.000000 1.000000 4.000000 3.000000 3.000000 3.000000 3.503254 349486.000000 127628.000000 1.033505e+07
75% 1744.000000 148.000000 2.000000 1.000000 4.000000 4.000000 4.000000 4.000000 3.880000 451562.000000 184506.000000 1.299390e+07
max 2500.000000 200.000000 4.000000 2.000000 6.000000 5.000000 5.000000 5.000000 4.970000 983922.000000 495405.000000 2.366734e+07
In [15]:
df.AREA.value_counts()
Out[15]:
Chrompet      1702
Karapakam     1366
KK Nagar       997
Velachery      981
Anna Nagar     788
Adyar          774
T Nagar        501
Name: AREA, dtype: int64
In [16]:
df.groupby('AREA',as_index=False).count()
Out[16]:
AREA PRT_ID INT_SQFT DATE_SALE DIST_MAINROAD N_BEDROOM N_BATHROOM N_ROOM SALE_COND PARK_FACIL DATE_BUILD BUILDTYPE UTILITY_AVAIL STREET MZZONE QS_ROOMS QS_BATHROOM QS_BEDROOM QS_OVERALL REG_FEE COMMIS SALES_PRICE
0 Adyar 774 774 774 774 774 774 774 774 774 774 774 774 774 774 774 774 774 774 774 774 774
1 Anna Nagar 788 788 788 788 788 788 788 788 788 788 788 788 788 788 788 788 788 788 788 788 788
2 Chrompet 1702 1702 1702 1702 1702 1702 1702 1702 1702 1702 1702 1702 1702 1702 1702 1702 1702 1702 1702 1702 1702
3 KK Nagar 997 997 997 997 997 997 997 997 997 997 997 997 997 997 997 997 997 997 997 997 997
4 Karapakam 1366 1366 1366 1366 1366 1366 1366 1366 1366 1366 1366 1366 1366 1366 1366 1366 1366 1366 1366 1366 1366
5 T Nagar 501 501 501 501 501 501 501 501 501 501 501 501 501 501 501 501 501 501 501 501 501
6 Velachery 981 981 981 981 981 981 981 981 981 981 981 981 981 981 981 981 981 981 981 981 981

Count of Houses in different Areas in Chennai¶

  • Chrompet Area Has Maximum Houses When Compared to Others
  • T Nagar Area Has Less Houses in Chennai
In [17]:
fig = px.pie(df.groupby('AREA',as_index=False).count(),
       names='AREA',
       values='PRT_ID',
       #labels={'PRT_ID':'Count'},
       template='plotly_dark',
       color_discrete_sequence=px.colors.sequential.Plasma,
       hole=.5,
       title='<b> Houses Count in different Areas of Chennai<b>'
      )

fig.show()
pyo.plot(fig, filename = 'Houses Count in different Areas of Chennai.html', auto_open = False)
23.9%19.2%14%13.8%11.1%10.9%7.05%
ChrompetKarapakamKK NagarVelacheryAnna NagarAdyarT Nagar Houses Count in different Areas of Chennai
plotly-logomark
Out[17]:
'Houses Count in different Areas of Chennai.html'

How the Sales Price of Houses varies with Different Areas in Chennai ?¶

  • T Nagar and Anna Nagar Area Houses Has Maximum Sales Price and KK Nagar Area Houses are second Max Sales Price
  • Karapakam Area Houses has Less Sales Price When Compared to other Area Houses
In [19]:
fig = px.box(df,
       x='AREA',
       y='SALES_PRICE',
       color='AREA',
       template='plotly_dark',
       title='<b> Sales Price of Houses in different Areas')

fig.show()
pyo.plot(fig, filename = 'Sales Price of Houses in different Areas.html', auto_open = False)
KarapakamAnna NagarAdyarVelacheryChrompetKK NagarT Nagar5M10M15M20M
AREAKarapakamAnna NagarAdyarVelacheryChrompetKK NagarT Nagar Sales Price of Houses in different AreasAREASALES_PRICE
plotly-logomark
Out[19]:
'Sales Price of Houses in different Areas.html'

How the Square Foot of the House will Effect the Sales Price?¶

  • Increase in Square Foot of Houses increases the Sales price of Houses
  • As we can see the Houses Related to Same Area Having Almost Similar Square feet Length
  • T Nagar and Anna Nagar Area Houses has Maximum Price Having Square Foot in the Range of 1500 to 2000
  • KK Nagar Area Houses Has high Square foots from 1400 to 2500
In [20]:
fig = px.scatter(df,
           x='INT_SQFT',
           y='SALES_PRICE',
           color='AREA',
           size='INT_SQFT',
           labels={'INT_SQFT':'SQFT'},
           template='plotly_dark',
           title='<b> SQFT Versus SALES PRICE of Houses in Different Areas')

fig.show()
pyo.plot(fig, filename = 'SQFT Versus SALES PRICE of Houses in Different Areas.html', auto_open = False)
50010001500200025005M10M15M20M25M
AREAKarapakamAnna NagarAdyarVelacheryChrompetKK NagarT Nagar SQFT Versus SALES PRICE of Houses in Different AreasSQFTSALES_PRICE
plotly-logomark
Out[20]:
'SQFT Versus SALES PRICE of Houses in Different Areas.html'

How does Sales Price Depends on the Main Road Distance From the House?¶

  • It Seems Like MainRoad Distance Does Not Affecting the Sales Price of Houses
  • The Price is same for the Houses having shorter Main Road Distance and Larger Main Road Distance
  • So the Main Road Distance is Not influencing More on Sales Price
In [21]:
fig = px.scatter(df,
           x='DIST_MAINROAD',
           y='SALES_PRICE',
           color='AREA',
           size='SALES_PRICE',
           template='plotly_dark',
           title='<b> MAINROAD DISTANCE Vs SALES_PRICE')

fig.show()
pyo.plot(fig, filename = 'MAINROAD DISTANCE Vs SALES_PRICE.html', auto_open = False)
0501001502005M10M15M20M25M
AREAKarapakamAnna NagarAdyarVelacheryChrompetKK NagarT Nagar MAINROAD DISTANCE Vs SALES_PRICEDIST_MAINROADSALES_PRICE
plotly-logomark
Out[21]:
'MAINROAD DISTANCE Vs SALES_PRICE.html'

How the Sales Price Depends On the Build Date and Sale Date of Houses ?¶

In [22]:
fig =px.scatter(df,
               x='DATE_BUILD',
               y='SALES_PRICE',
               color='AREA',
               size='SALES_PRICE',
               template='plotly_dark',
               title='<b> Build Date Vs Sales Price')
fig.show()
pyo.plot(fig, filename = 'Build Date Vs Sales Price.html', auto_open = False)

fig=px.scatter(df,
               x='DATE_SALE',
               y='SALES_PRICE',
               color='AREA',
               size='SALES_PRICE',
               template='plotly_dark',
               title='<b> Sale Date Vs Sales Price')
fig.show()
pyo.plot(fig, filename = 'Sale Date Vs Sales Price.html', auto_open = False)
15-05-196711-10-199603-04-197815-03-196923-06-200619-12-199425-08-198006-10-197405-08-196726-05-198102-05-198304-06-200422-01-199418-04-198115-03-198916-10-198711-08-197530-04-198504-09-199629-01-197623-12-197925-07-196713-07-197326-10-198813-04-198325-09-198021-08-197516-07-200401-03-199522-10-198623-02-199030-03-199511-02-199021-03-199703-04-199527-08-198006-03-198724-09-198811-06-197222-02-198208-03-199723-10-199405-07-199521-03-199919-11-199031-08-200225-02-200610-11-200319-02-199306-03-197325-12-199624-01-197818-08-200524-05-199520-07-199702-11-198528-12-200418-07-199621-01-199225-05-200625-07-199721-11-199705-07-200107-07-199918-02-198812-08-199009-04-199922-01-198928-06-200230-03-20015M10M15M20M25M
AREAKarapakamAnna NagarAdyarVelacheryChrompetKK NagarT Nagar Build Date Vs Sales PriceDATE_BUILDSALES_PRICE
plotly-logomark
04-05-201110-04-201103-08-201025-03-201211-12-201029-10-201112-04-201024-09-201028-11-200930-01-201230-10-201130-06-201001-07-201019-05-201122-07-201119-06-201129-12-201001-03-201028-03-201012-02-201507-01-200816-12-200722-08-201221-07-200829-11-201217-08-200718-11-201211-03-200905-06-201412-05-201420-08-201119-08-201116-05-201114-12-200604-08-200804-11-200727-02-201104-01-200831-07-201122-04-200712-12-200413-06-200526-01-200517-02-200612-07-200402-06-200614-05-200626-05-200822-06-200518-01-200721-12-201217-09-200817-08-201013-02-201504-12-200716-08-201312-03-201405-05-201323-10-200606-12-201320-08-201411-08-201315-04-200702-08-200710-03-200917-08-200822-03-200905-07-201431-01-20085M10M15M20M25M
AREAKarapakamAnna NagarAdyarVelacheryChrompetKK NagarT Nagar Sale Date Vs Sales PriceDATE_SALESALES_PRICE
plotly-logomark
Out[22]:
'Sale Date Vs Sales Price.html'

How the Sales Price Depends On Number of Rooms ?¶

  • As we can see increase in Rooms Count increases the Sales Price of Houses
  • Houses having total Four and Five Rooms in the areas of Anna Nagar,Velacherry,KK Nagar and T Nagar Has Maximum Sales Price
  • Karapakam,Adyar and Chrompet Area Houses having Two to Four Rooms but four room houses has maximum sales price when compared to others
  • Anna Nagar,Velacherry and T Nagar Areas consists of Four to Five Room Houses
  • KK Nagar Area Houses consists of Two to six Rooms and the price incresases with increase in the Room Count
In [23]:
fig = px.box(df,
       x='N_ROOM',
       y='SALES_PRICE',
       color='AREA',
       template='plotly_dark',
       title='<b>Total Rooms Versus Sales Price of Houses in Different Areas')

fig.show()
pyo.plot(fig, filename = 'Total Rooms Versus Sales Price of Houses in Different Areas.html', auto_open = False)
234565M10M15M20M
AREAKarapakamAnna NagarAdyarVelacheryChrompetKK NagarT NagarTotal Rooms Versus Sales Price of Houses in Different AreasN_ROOMSALES_PRICE
plotly-logomark
Out[23]:
'Total Rooms Versus Sales Price of Houses in Different Areas.html'

How the Sales Price Depends On Number of Bed Rooms ?¶

  • Increase in Bed Rooms Increases the Sales Price
  • Anna Nagar,T Nagar,Adyar,Karapakam and Chrompet Area Houses consists of one to two bedrooms but two bedroom houses has maximum sales price in these areas
  • KK Nagar Area Consists of Houses having one to four bedrooms and Sales price increases with increase in bedrooms
  • Velacherry Area Houses Consists of two to three Bedrooms
  • Out of all Anna Nagar and T Nagar Area Houses with two Bedrooms has Maximum Sales Price
In [32]:
fig = px.box(df,
                 x='N_BEDROOM',
                 #y='AREA',
                 y='SALES_PRICE',
                 color = 'AREA',
                 template='plotly_dark',
                 title='<b>Total BedRooms Versus Sales Price of Houses in Different Areas')

fig.show()
pyo.plot(fig, filename = 'Total BedRooms Versus Sales Price of Houses in Different Areas.html', auto_open = False)
12345M10M15M20M
AREAKarapakamAnna NagarAdyarVelacheryChrompetKK NagarT NagarTotal BedRooms Versus Sales Price of Houses in Different AreasN_BEDROOMSALES_PRICE
plotly-logomark
Out[32]:
'Total BedRooms Versus Sales Price of Houses in Different Areas.html'

How the Sales Price Depends On the Number of Bath Rooms ?¶

  • Increase in Bathrooms Increases the Sales Price in all Areas
  • Anna Nagar and T Nagar Area houses Having only one Bathroom Has Maximum Sales Price when Compared to other Areas
In [33]:
fig = px.box(df,
              x='N_BATHROOM',
              y='SALES_PRICE',
              color='AREA',
              template='plotly_dark',
              title='<b>Total BathRooms Versus Sales Price of Houses in Different Areas')

fig.show()
pyo.plot(fig, filename = 'Total BedRooms Versus Sales Price of Houses in Different Areas.html', auto_open = False)
125M10M15M20M
AREAKarapakamAnna NagarAdyarVelacheryChrompetKK NagarT NagarTotal BathRooms Versus Sales Price of Houses in Different AreasN_BATHROOMSALES_PRICE
plotly-logomark
Out[33]:
'Total BedRooms Versus Sales Price of Houses in Different Areas.html'

How the Sales Price Depends On the Sale Condion of the Houses?¶

  • Sales Price is Maximum for Normal, Family and AbNormal conditions of Sale
In [34]:
fig = px.box(df,
       x='SALE_COND',
       y='SALES_PRICE',
       color='AREA',
       template='plotly_dark',
       title='<b>Sale Condition Versus Sales Price of Houses in Different Areas')

fig.show()
pyo.plot(fig, filename = 'Sale Condition Versus Sales Price of Houses in Different Areas.html', auto_open = False)
AbNormalNormal SaleAdjLandPartialFamily5M10M15M20M
AREAKarapakamAnna NagarAdyarVelacheryChrompetKK NagarT NagarSale Condition Versus Sales Price of Houses in Different AreasSALE_CONDSALES_PRICE
plotly-logomark
Out[34]:
'Sale Condition Versus Sales Price of Houses in Different Areas.html'

How the Sales Price Depends On the Parking Facility of the Houses?¶

  • Sales Price is Maximum for the Houses having Parking Facility in Different Areas
In [35]:
fig = px.box(df,
       x='PARK_FACIL',
       y='SALES_PRICE',
       color='AREA',
       template='plotly_dark',
       title='<b>Parking Facility Versus Sales Price of Houses in Different Areas')


fig.show()
pyo.plot(fig, filename = 'Parking Facility Versus Sales Price of Houses in Different Areas.html', auto_open = False)
YesNo5M10M15M20M
AREAKarapakamAnna NagarAdyarVelacheryChrompetKK NagarT NagarParking Facility Versus Sales Price of Houses in Different AreasPARK_FACILSALES_PRICE
plotly-logomark
Out[35]:
'Parking Facility Versus Sales Price of Houses in Different Areas.html'

How the Sales Price Depends On the Build Type of the Houses?¶

  • Sales Price is Maximum for the Houses having Commercial Build Type in Different Areas
In [36]:
fig = px.box(df,
       x='BUILDTYPE',
       y='SALES_PRICE',
       color='AREA',
       template='plotly_dark',
       title='<b>Build Type Versus Sales Price of Houses in Different Areas')

fig.show()
pyo.plot(fig, filename = 'Build Type Versus Sales Price of Houses in Different Areas.html', auto_open = False)
CommercialOthersHouse5M10M15M20M
AREAKarapakamAnna NagarAdyarVelacheryChrompetKK NagarT NagarBuild Type Versus Sales Price of Houses in Different AreasBUILDTYPESALES_PRICE
plotly-logomark
(Commercial, max: 20.24636M)(Commercial, upper fence: 20.24636M)(Commercial, q3: 17.6129M)KK Nagar(Commercial, median: 16.19731M)(Commercial, q1: 13.94082M)(Commercial, lower fence: 11.28672M)(Commercial, min: 11.28672M)
Out[36]:
'Build Type Versus Sales Price of Houses in Different Areas.html'

How the Sales Price Depends On the Street Type of the Houses?¶

  • Sales Price is almost equal for all the Houses Having different Streets in different Areas
In [37]:
fig = px.box(df,
       x='STREET',
       y='SALES_PRICE',
       color='AREA',
       template='plotly_dark',
       title='<b> Street Versus Sales Price of Houses in Different Areas')

fig.show()
pyo.plot(fig, filename = 'Street Versus Sales Price of Houses in Different Areas.html', auto_open = False)
PavedGravelNo Access5M10M15M20M
AREAKarapakamAnna NagarAdyarVelacheryChrompetKK NagarT Nagar Street Versus Sales Price of Houses in Different AreasSTREETSALES_PRICE
plotly-logomark
Out[37]:
'Street Versus Sales Price of Houses in Different Areas.html'

Correlation¶

In [38]:
df.corr()
Out[38]:
INT_SQFT DIST_MAINROAD N_BEDROOM N_BATHROOM N_ROOM QS_ROOMS QS_BATHROOM QS_BEDROOM QS_OVERALL REG_FEE COMMIS SALES_PRICE
INT_SQFT 1.000000 0.002022 0.786263 0.515290 0.951279 0.016146 -0.011463 0.005847 0.013989 0.657544 0.571076 0.612125
DIST_MAINROAD 0.002022 1.000000 -0.002723 0.001850 0.002301 0.013050 -0.021857 0.002757 -0.017255 0.011600 0.010994 0.018783
N_BEDROOM 0.786263 -0.002723 1.000000 0.755207 0.840209 0.011434 -0.008603 0.014449 0.014245 0.455380 0.430446 0.330987
N_BATHROOM 0.515290 0.001850 0.755207 1.000000 0.568541 0.010452 -0.011005 0.013390 0.007961 0.260244 0.256726 0.108834
N_ROOM 0.951279 0.002301 0.840209 0.568541 1.000000 0.013468 -0.009066 0.012072 0.015418 0.630932 0.533343 0.602760
QS_ROOMS 0.016146 0.013050 0.011434 0.010452 0.013468 1.000000 0.001796 0.009590 0.489271 0.014614 0.007072 0.016059
QS_BATHROOM -0.011463 -0.021857 -0.008603 -0.011005 -0.009066 0.001796 1.000000 -0.011917 0.513585 -0.014777 -0.008397 -0.019469
QS_BEDROOM 0.005847 0.002757 0.014449 0.013390 0.012072 0.009590 -0.011917 1.000000 0.598154 0.018875 0.019376 0.015044
QS_OVERALL 0.013989 -0.017255 0.014245 0.007961 0.015418 0.489271 0.513585 0.598154 1.000000 0.022410 0.016946 0.020485
REG_FEE 0.657544 0.011600 0.455380 0.260244 0.630932 0.014614 -0.014777 0.018875 0.022410 1.000000 0.659903 0.878148
COMMIS 0.571076 0.010994 0.430446 0.256726 0.533343 0.007072 -0.008397 0.019376 0.016946 0.659903 1.000000 0.626275
SALES_PRICE 0.612125 0.018783 0.330987 0.108834 0.602760 0.016059 -0.019469 0.015044 0.020485 0.878148 0.626275 1.000000

Pairwise Correlation of All Columns in the Data¶

In [39]:
plt.figure(figsize=(12,6))
sn.heatmap(df.corr(),annot=True,cmap='CMRmap')
plt.title('Heatmap of the Data')
plt.show()
In [40]:
# Creating a Back up File
df1=df.copy()

Coverting the labels into a numeric form using Label Encoder¶

In [41]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
In [42]:
for col in df.columns:
    if df[col].dtype=='object':
        df[col]=le.fit_transform(df[col])
In [43]:
df.head()
Out[43]:
PRT_ID AREA INT_SQFT DATE_SALE DIST_MAINROAD N_BEDROOM N_BATHROOM N_ROOM SALE_COND PARK_FACIL DATE_BUILD BUILDTYPE UTILITY_AVAIL STREET MZZONE QS_ROOMS QS_BATHROOM QS_BEDROOM QS_OVERALL REG_FEE COMMIS SALES_PRICE
0 2266 4 1004 310 131 1 1 3 0 1 2731 0 0 2 0 4 3 4 4.330 380000 144400 7600000
1 6664 1 1986 1745 26 2 1 5 0 0 4226 0 0 0 3 4 4 2 3.765 760122 304049 21717770
2 1270 0 909 288 70 1 1 3 0 1 1511 0 1 0 4 4 3 2 3.090 421094 92114 13159200
3 3755 6 1855 1117 14 3 2 5 2 0 3300 2 3 2 2 4 3 3 4.010 356321 77042 9630290
4 4393 4 1226 442 84 1 1 3 0 1 2406 2 0 0 1 3 2 4 3.290 237000 74063 7406250
storing the Dependent Variables in X and Independent Variable in Y¶
In [44]:
x=df.drop('SALES_PRICE',axis=1)
y=df['SALES_PRICE']

Splitting the Data into Training set and Testing Set¶

In [45]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=42)
x_train.shape,x_test.shape,y_train.shape,y_test.shape
Out[45]:
((5331, 21), (1778, 21), (5331,), (1778,))

Scaling the values to convert the int values to Machine Languages¶

In [46]:
from sklearn.preprocessing import MinMaxScaler
mmscaler=MinMaxScaler(feature_range=(0,1))
ms = mmscaler.fit(x_train)

x_train = ms.transform(x_train)
x_test = ms.transform(x_test)
x_train = pd.DataFrame(x_train)
x_test = pd.DataFrame(x_test)
In [47]:
a = {'Model Name':[], 
     'Mean_Absolute_Error_MAE':[] ,
     'Adj_R_Square':[] ,
     'Root_Mean_Squared_Error_RMSE':[] ,
     'Mean_Absolute_Percentage_Error_MAPE':[] ,
     'Mean_Squared_Error_MSE':[] ,
     'Root_Mean_Squared_Log_Error_RMSLE':[] ,
     'R2_score':[]}

Results=pd.DataFrame(a)
Results.head()
Out[47]:
Model Name Mean_Absolute_Error_MAE Adj_R_Square Root_Mean_Squared_Error_RMSE Mean_Absolute_Percentage_Error_MAPE Mean_Squared_Error_MSE Root_Mean_Squared_Log_Error_RMSLE R2_score

Build the Regression / Regressor models¶

In [49]:
# Import the library
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
In [50]:
lg = LinearRegression()
dt = DecisionTreeRegressor()
rf = RandomForestRegressor()
SVR = SVR()
xgbr = xgb.XGBRegressor()
KNN = KNeighborsRegressor(n_neighbors=5)
ETR = ExtraTreesRegressor()

Model = [lg, dt, rf, SVR,KNN, ETR, xgbr]
In [51]:
model_GBR = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0,
                                     criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1,
                                     min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
                                     init=None, random_state=None, max_features=None,
                                     alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False,
                                     validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)
In [52]:
# Evalution matrix for all the algorithms

for model in Model:
    
    # Fit the model with train data
    model.fit(x_train, y_train)
    
    # Predict the model with test data
    y_pred = model.predict(x_test)
    
    # Print the model name
    print('Model Name: ', model)
    
    # Evaluation metrics for Regression analysis
    from sklearn import metrics

    print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
    print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
    print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
    print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
    print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))
    
    # Define the function to calculate the MAPE - Mean Absolute Percentage Error
    def MAPE (y_test, y_pred):
        y_test, y_pred = np.array(y_test), np.array(y_pred)
        return np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    
    # Evaluation of MAPE 
    result = MAPE(y_test, y_pred)
    print('Mean Absolute Percentage Error (MAPE):', round(result, 2), '%')
    
    # Calculate Adjusted R squared values 
    r_squared = round(metrics.r2_score(y_test, y_pred),6)
    adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
    print('Adj R Square: ', adjusted_r_squared)
    print('******************************************************************************************************************')
    #-------------------------------------------------------------------------------------------
    new_row = {'Model Name' : model,
               'Mean_Absolute_Error_MAE' : metrics.mean_absolute_error(y_test, y_pred),
               'Adj_R_Square' : adjusted_r_squared,
               'Root_Mean_Squared_Error_RMSE' : np.sqrt(metrics.mean_squared_error(y_test, y_pred)),
               'Mean_Absolute_Percentage_Error_MAPE' : result,
               'Mean_Squared_Error_MSE' : metrics.mean_squared_error(y_test, y_pred),
               'Root_Mean_Squared_Log_Error_RMSLE': np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),
               'R2_score' : metrics.r2_score(y_test, y_pred)}
    Results = Results.append(new_row, ignore_index=True)
    print()
Model Name:  LinearRegression()
Mean Absolute Error (MAE): 1022554.549
Mean Squared Error (MSE): 1663603131167.479
Root Mean Squared Error (RMSE): 1289807.401
R2_score: 0.875211
Root Mean Squared Log Error (RMSLE): 14.07
Mean Absolute Percentage Error (MAPE): 10.0 %
Adj R Square:  0.874841
******************************************************************************************************************

Model Name:  DecisionTreeRegressor()
Mean Absolute Error (MAE): 766160.228
Mean Squared Error (MSE): 1083848977254.26
Root Mean Squared Error (RMSE): 1041080.678
R2_score: 0.918699
Root Mean Squared Log Error (RMSLE): 13.856
Mean Absolute Percentage Error (MAPE): 7.61 %
Adj R Square:  0.918458
******************************************************************************************************************

Model Name:  RandomForestRegressor()
Mean Absolute Error (MAE): 529879.249
Mean Squared Error (MSE): 451763452981.996
Root Mean Squared Error (RMSE): 672133.508
R2_score: 0.966113
Root Mean Squared Log Error (RMSLE): 13.418
Mean Absolute Percentage Error (MAPE): 5.39 %
Adj R Square:  0.966013
******************************************************************************************************************

Model Name:  SVR()
Mean Absolute Error (MAE): 2830943.966
Mean Squared Error (MSE): 13621656915965.639
Root Mean Squared Error (RMSE): 3690752.893
R2_score: -0.02178
Root Mean Squared Log Error (RMSLE): 15.121
Mean Absolute Percentage Error (MAPE): 28.67 %
Adj R Square:  -0.024808
******************************************************************************************************************

Model Name:  KNeighborsRegressor()
Mean Absolute Error (MAE): 1260718.96
Mean Squared Error (MSE): 2848745691648.293
Root Mean Squared Error (RMSE): 1687822.767
R2_score: 0.786312
Root Mean Squared Log Error (RMSLE): 14.339
Mean Absolute Percentage Error (MAPE): 11.92 %
Adj R Square:  0.785679
******************************************************************************************************************

Model Name:  ExtraTreesRegressor()
Mean Absolute Error (MAE): 369291.913
Mean Squared Error (MSE): 217005195335.877
Root Mean Squared Error (RMSE): 465838.164
R2_score: 0.983722
Root Mean Squared Log Error (RMSLE): 13.052
Mean Absolute Percentage Error (MAPE): 3.82 %
Adj R Square:  0.983674
******************************************************************************************************************

Model Name:  XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)
Mean Absolute Error (MAE): 357348.064
Mean Squared Error (MSE): 213624495167.217
Root Mean Squared Error (RMSE): 462195.3
R2_score: 0.983976
Root Mean Squared Log Error (RMSLE): 13.044
Mean Absolute Percentage Error (MAPE): 3.57 %
Adj R Square:  0.983929
******************************************************************************************************************

In [53]:
Results
Out[53]:
Model Name Mean_Absolute_Error_MAE Adj_R_Square Root_Mean_Squared_Error_RMSE Mean_Absolute_Percentage_Error_MAPE Mean_Squared_Error_MSE Root_Mean_Squared_Log_Error_RMSLE R2_score
0 LinearRegression() 1.022555e+06 0.874841 1.289807e+06 10.004632 1.663603e+12 14.070003 0.875211
1 DecisionTreeRegressor() 7.661602e+05 0.918458 1.041081e+06 7.613756 1.083849e+12 13.855770 0.918699
2 (DecisionTreeRegressor(max_features=1.0, rando... 5.298792e+05 0.966013 6.721335e+05 5.385192 4.517635e+11 13.418212 0.966113
3 SVR() 2.830944e+06 -0.024808 3.690753e+06 28.670171 1.362166e+13 15.121341 -0.021780
4 KNeighborsRegressor() 1.260719e+06 0.785679 1.687823e+06 11.924981 2.848746e+12 14.338950 0.786312
5 (ExtraTreeRegressor(random_state=1390877790), ... 3.692919e+05 0.983674 4.658382e+05 3.816738 2.170052e+11 13.051594 0.983722
6 XGBRegressor(base_score=0.5, booster='gbtree',... 3.573481e+05 0.983929 4.621953e+05 3.570879 2.136245e+11 13.043743 0.983976

From the above Results, the Top 3 Models by Comparing Adjacent R Square Values are¶

  1. ExtraTreesRegressor
  2. XGBRegressor
  3. GradientBoostingRegressor
  • Training and Predicting with XGBRegressor
In [92]:
# Training the Model
xgbr.fit(x_train, y_train) 

# Predict the model with test data
y_pred_xgb = xgbr.predict(x_test)
  • Training and Predicting with ExtraTreesRegressor
In [93]:
# Training the Model
ETR.fit(x_train, y_train) 

# Predict the model with test data
y_pred_ETR = ETR.predict(x_test)
In [94]:
output= pd.DataFrame({'Price_actual':y_test, 'Price_pred (XGB)':y_pred_xgb, 'Price_pred (ETR)': y_pred_ETR})
result=df1.merge(output,left_index=True,right_index=True)

#result.head()
result[['PRT_ID','AREA','Price_actual','Price_pred (XGB)','Price_pred (ETR)']].sample(20)
Out[94]:
PRT_ID AREA Price_actual Price_pred (XGB) Price_pred (ETR)
3289 P04715 Karapakam 8553000 8648174.0 8289827.50
4220 P06076 Velachery 13685190 13847931.0 13796077.40
149 P07490 KK Nagar 12472000 12543332.0 12593784.00
3431 P06582 Karapakam 7866250 8243390.0 7543856.00
380 P09981 Velachery 12350640 13170537.0 12647901.20
6903 P01512 T Nagar 17414970 15758530.0 17159051.00
3197 P06601 Karapakam 8372750 8335412.0 7981913.25
3822 P05119 KK Nagar 16814260 17245898.0 17006137.40
2310 P05308 Karapakam 8008250 7777017.0 7953017.50
6044 P09738 Velachery 14602040 14162924.0 15240464.80
4538 P06888 Anna Nagar 21008600 20199684.0 20767024.00
4836 P05101 Velachery 13160490 13836980.0 13360752.70
3718 P04468 Karapakam 7689500 6666313.0 6443398.75
3317 P00958 Chrompet 10847700 10876618.0 10985148.90
2885 P01370 Chrompet 9190000 9059926.0 8989433.50
1022 P03690 Karapakam 9888000 10544028.0 10901647.65
5938 P03472 Chrompet 8905710 8451058.0 8217775.15
5243 P01754 Karapakam 11837500 11679182.0 11408986.60
5758 P06930 KK Nagar 8545660 9173977.0 8930851.40
2283 P09776 Velachery 15941330 16022495.0 16274440.30
In [99]:
fig = px.scatter(result,
                 x='Price_actual',
                 y='Price_pred (XGB)',
                 trendline='ols',
                 color = 'AREA',
                 template='plotly_dark',
                 title='<b> Actual Price  Vs  Predicted Price with XGBRegressor')
fig.show()

fig = px.scatter(result,
                 x='Price_actual',
                 y='Price_pred (ETR)',
                 trendline='ols',
                 color = 'AREA',
                 template='plotly_dark',
                 title='<b> Actual Price  Vs  Predicted Price with ExtraTreesRegressor')
fig.show()
pyo.plot(fig, filename = 'Actual Price  Vs  Predicted Price.html', auto_open = False)
5M10M15M20M5M10M15M20M
AREAChrompetVelacheryAnna NagarAdyarKarapakamKK NagarT Nagar Actual Price Vs Predicted Price with XGBRegressorPrice_actualPrice_pred (XGB)
plotly-logomark
5M10M15M20M5M10M15M20M
AREAChrompetVelacheryAnna NagarAdyarKarapakamKK NagarT Nagar Actual Price Vs Predicted Price with ExtraTreesRegressorPrice_actualPrice_pred (ETR)
plotly-logomark
Out[99]:
'Actual Price  Vs  Predicted Price.html'